{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 下载数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import subprocess\n",
    "import os\n",
    "\n",
    "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
    "output = result.stdout\n",
    "for line in output.splitlines():\n",
    "    if '=' in line:\n",
    "        var, value = line.split('=', 1)\n",
    "        os.environ[var] = value\n",
    "\n",
    "from huggingface_hub import notebook_login\n",
    "\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "reasoning_dataset = load_dataset(\"unsloth/OpenMathReasoning-mini\", split = \"cot\",cache_dir='./data/reason')\n",
    "\n",
    "non_reasoning_dataset = load_dataset(\"mlabonne/FineTome-100k\", split = \"train\",cache_dir='./data/no_reason')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 处理成llama-facctory需要的格式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "from datasets import Dataset\n",
    "import json\n",
    "\n",
    "# 处理推理数据集（带推理链）\n",
    "def process_reason_data(ds, num_samples, start_idx=0):\n",
    "    samples = ds.select(range(start_idx, start_idx + num_samples))\n",
    "    processed = []\n",
    "    for item in samples:\n",
    "        # 将chain_of_thought用<think>标签包裹，然后与output拼接\n",
    "        cot = \"\\n\".join(item[\"chain_of_thought\"])\n",
    "        output = f\"<think>{cot}</think>\\n{item['output']}\"\n",
    "        processed.append({\n",
    "            \"instruction\": item[\"input\"],\n",
    "            \"input\": \"\",\n",
    "            \"output\": output\n",
    "        })\n",
    "    return processed\n",
    "\n",
    "# 处理非推理数据集\n",
    "def process_no_reason_data(ds, num_samples, start_idx=0):\n",
    "    samples = ds.select(range(start_idx, start_idx + num_samples))\n",
    "    processed = []\n",
    "    for item in samples:\n",
    "        processed.append({\n",
    "            \"instruction\": item[\"question\"]+'<think>',\n",
    "            \"input\": \"\",\n",
    "            \"output\": item[\"answer\"]\n",
    "        })\n",
    "    return processed\n",
    "\n",
    "# 创建推理训练集（前5000条）\n",
    "train_reason = process_reason_data(ds_reason, 5000)\n",
    "reason_train_dataset = Dataset.from_list(train_reason)\n",
    "\n",
    "# 创建非推理训练集（前1000条）\n",
    "train_no_reason = process_no_reason_data(ds_no_reason, 1000)\n",
    "no_reason_train_dataset = Dataset.from_list(train_no_reason)\n",
    "\n",
    "# 创建测试集（不重叠的数据）\n",
    "test_reason = process_reason_data(ds_reason, 30, 5000)  # 取5000-5029\n",
    "test_no_reason = process_no_reason_data(ds_no_reason, 20, 1000)  # 取1000-1019\n",
    "test_data = test_reason + test_no_reason\n",
    "random.shuffle(test_data)  # 打乱顺序\n",
    "test_dataset = Dataset.from_list(test_data)\n",
    "\n",
    "# 保存为三个独立的JSON文件\n",
    "def save_to_json(dataset, filename):\n",
    "    with open(filename, 'w', encoding='utf-8') as f:\n",
    "        json.dump(dataset.to_list(), f, indent=2, ensure_ascii=False)\n",
    "\n",
    "save_to_json(reason_train_dataset, \"reason_train.json\")\n",
    "save_to_json(no_reason_train_dataset, \"no_reason_train.json\")\n",
    "save_to_json(test_dataset, \"mixed_test.json\")\n",
    "\n",
    "print(\"文件保存完成！\")\n",
    "print(f\"推理训练集: reason_train.json ({len(reason_train_dataset)}条)\")\n",
    "print(f\"非推理训练集: no_reason_train.json ({len(no_reason_train_dataset)}条)\")\n",
    "print(f\"混合测试集: mixed_test.json ({len(test_dataset)}条)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 登记数据集"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "在dataset_info.json中添加\n",
    "\n",
    "\"reason_train\": {\n",
    "    \"file_name\": \"reason_train.json\"\n",
    "  },\n",
    "  \"no_reason_train\": {\n",
    "    \"file_name\": \"no_reason_train.json\"\n",
    "  },\n",
    "  \"mixed_test\": {\n",
    "    \"file_name\": \"mixed_test.json\"\n",
    "  },"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 打开llama factory判断是否可以识别到数据"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
